{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Streetcar Delay Prediction - Data Preparation\n",
    "\n",
    "Use dataset covering Toronto Transit Commission (TTC) streetcar delays 2014 - present to predict future delays and come up with recommendations for avoiding delays.\n",
    "\n",
    "Source dataset: https://open.toronto.ca/dataset/ttc-streetcar-delay-data/\n",
    "\n",
    "This notebook contains the common data loading and preparation steps:\n",
    "- load data from all the tabs of all the XLS files into a single dataframe\n",
    "- correct type issues\n",
    "- fix missing values\n",
    "- clean up anomalies in the route, location, direction and vehicle columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Streetcar routes\n",
    "\n",
    "From https://www.ttc.ca/Routes/Streetcars.jsp\n",
    "\n",
    "<table style=\"border: none\" align=\"left\">\n",
    "   </tr>\n",
    "   <tr style=\"border: none\">\n",
    "       <th style=\"border: none\"><img src=\"https://raw.githubusercontent.com/ryanmark1867/streetcarnov3/master/streetcar%20routes.jpg\" width=\"600\" alt=\"Icon\"> </th>\n",
    "   </tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Common imports and variables\n",
    "Imports and variable definitions that are common to the entire notebook\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: requests in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (2.22.0)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2.8)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (3.0.4)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (2019.6.16)\n",
      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (from requests) (1.25.3)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 19.0.3, however version 20.1 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: xlrd in c:\\users\\ryanm\\appdata\\local\\programs\\python\\python37\\lib\\site-packages (1.2.0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You are using pip version 19.0.3, however version 20.1 is available.\n",
      "You should consider upgrading via the 'python -m pip install --upgrade pip' command.\n"
     ]
    }
   ],
   "source": [
    "!pip install requests\n",
    "!pip install xlrd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import matplotlib.pyplot as plt\n",
    "import datetime as dt\n",
    "# common imports\n",
    "import zipfile\n",
    "import time\n",
    "# import datetime, timedelta\n",
    "import datetime\n",
    "from datetime import datetime, timedelta\n",
    "from datetime import date\n",
    "from dateutil import relativedelta\n",
    "from io import StringIO\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from sklearn.base import BaseEstimator\n",
    "from sklearn.base import TransformerMixin\n",
    "from io import StringIO\n",
    "import requests\n",
    "import json\n",
    "from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline \n",
    "import os\n",
    "import math\n",
    "from subprocess import check_output\n",
    "from IPython.display import display\n",
    "import logging\n",
    "import yaml\n",
    "from collections import Counter\n",
    "import re\n",
    "import os\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "current directory is: C:\\personal\\manning\\deep_learning_for_structured_data\\notebooks\n",
      "path_to_yaml C:\\personal\\manning\\deep_learning_for_structured_data\\notebooks\\streetcar_data_preparation_config.yml\n"
     ]
    }
   ],
   "source": [
    "# load config file\n",
    "current_path = os.getcwd()\n",
    "print(\"current directory is: \"+current_path)\n",
    "\n",
    "path_to_yaml = os.path.join(current_path, 'streetcar_data_preparation_config.yml')\n",
    "print(\"path_to_yaml \"+path_to_yaml)\n",
    "try:\n",
    "    with open (path_to_yaml, 'r') as c_file:\n",
    "        config = yaml.safe_load(c_file)\n",
    "except Exception as e:\n",
    "    print('Error reading the config file')\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# common variables\n",
    "# control whether to load data from scratch from original source or from saved dataframe\n",
    "load_from_scratch = config['general']['load_from_scratch']\n",
    "# control whether to save dataframe with transformed data\n",
    "save_transformed_dataframe = config['general']['save_transformed_dataframe']\n",
    "# control whether rows containing erroneous values are removed from the saved dataset\n",
    "remove_bad_values = config['general']['remove_bad_values']\n",
    "# name of file containing pickled dataframe version of input (unprocessed) dataset\n",
    "pickled_input_dataframe = config['file_names']['pickled_input_dataframe']\n",
    "# name of file to which prepared data set is saved as a pickled dataframe\n",
    "pickled_output_dataframe = config['file_names']['pickled_output_dataframe']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_from_scratch False\n",
      "save_transformed_dataframe True\n",
      "remove_bad_values True\n",
      "pickled_input_dataframe 2014_2019.pkl\n",
      "pickled_output_dataframe 2014_2019_df_cleaned_remove_bad_values_may16_2020.pkl\n"
     ]
    }
   ],
   "source": [
    "print(\"load_from_scratch \"+str(load_from_scratch))\n",
    "print(\"save_transformed_dataframe \"+str(save_transformed_dataframe))\n",
    "print(\"remove_bad_values \"+str(remove_bad_values))\n",
    "print(\"pickled_input_dataframe \"+str(pickled_input_dataframe))\n",
    "print(\"pickled_output_dataframe \"+str(pickled_output_dataframe))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Streetcar vehicle IDs CLRV/ALRV\n",
    "\n",
    "From https://en.wikipedia.org/wiki/Toronto_streetcar_system_rolling_stock#CLRVs_and_ALRVs\n",
    "\n",
    "<table style=\"border: none\" align=\"left\">\n",
    "   </tr>\n",
    "   <tr style=\"border: none\">\n",
    "       <th style=\"border: none\"><img src=\"https://raw.githubusercontent.com/ryanmark1867/streetcarnov3/master/streetcarCLRV.jpg\" width=\"600\" alt=\"Icon\"> </th>\n",
    "   </tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "valid streetcars [4000, 4001, 4002, 4003, 4004, 4005, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4017, 4018, 4019, 4020, 4021, 4022, 4023, 4024, 4025, 4026, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 4037, 4038, 4039, 4040, 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4048, 4049, 4050, 4051, 4052, 4053, 4054, 4055, 4056, 4057, 4058, 4059, 4060, 4061, 4062, 4063, 4064, 4065, 4066, 4067, 4068, 4069, 4070, 4071, 4072, 4073, 4074, 4075, 4076, 4077, 4078, 4079, 4080, 4081, 4082, 4083, 4084, 4085, 4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095, 4096, 4097, 4098, 4099, 4100, 4101, 4102, 4103, 4104, 4105, 4106, 4107, 4108, 4109, 4110, 4111, 4112, 4113, 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 4172, 4173, 4174, 4175, 4176, 4177, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 4186, 4187, 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, 4214, 4215, 4216, 4217, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4900, 4400, 4402, 4403, 4404, 4405, 4406, 4407, 4408, 4409, 4410, 4411, 4412, 4413, 4414, 4415, 4416, 4417, 4418, 4419, 4420, 4421, 4422, 4423, 4424, 4425, 4426, 4427, 4428, 4429, 4430, 4431, 4432, 4433, 4434, 4435, 4436, 4437, 4438, 4439, 4440, 4441, 4442, 4443, 4444, 4445, 4446, 4447, 4448, 4449, 4450, 4451, 4452, 4453, 4454, 4455, 4456, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4471, 4472, 4473, 4474, 4475, 4476, 4477, 4478, 4479, 4480, 4481, 4482, 4483, 4484, 4485, 4486, 4487, 4488, 4489, 4490, 4491, 4492, 4493, 4494, 4495, 4496, 4497, 4498, 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507]\n"
     ]
    }
   ],
   "source": [
    "streetcar_vehicles = list(range(4000,4006))+ list(range(4010,4200)) +  list(range(4200,4252)) + [4900]\n",
    "streetcar_vehicles = streetcar_vehicles + [4400] + list(range(4402,4508))\n",
    "print(\"valid streetcars\",streetcar_vehicles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Streetcar vehicle IDs Flexity\n",
    "\n",
    "From https://en.wikipedia.org/wiki/Toronto_streetcar_system_rolling_stock#CLRVs_and_ALRVs\n",
    "\n",
    "<table style=\"border: none\" align=\"left\">\n",
    "   </tr>\n",
    "   <tr style=\"border: none\">\n",
    "       <th style=\"border: none\"><img src=\"https://raw.githubusercontent.com/ryanmark1867/streetcarnov3/master/streetcarflexity.jpg\" width=\"600\" alt=\"Icon\"> </th>\n",
    "   </tr>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bus identification\n",
    "The following links define the valid non-streetcar vehicles that can be delayed by streetcar incidents\n",
    "\n",
    "- Buses 1xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_1000-1149\n",
    "- Buses 2xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_2000-2110,_2150-2155,_2240-2485,_2600-2619,_2700-2765,_2767-2858\n",
    "- Buses 70xx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_7000-7134\n",
    "- Buses 74xx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_7400-7499,_7500-7619,_7620-7881\n",
    "- Buses 8xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_8000-8099\n",
    "- Buses 9xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_9000-9026\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "valid vehicles [4000, 4001, 4002, 4003, 4004, 4005, 4010, 4011, 4012, 4013, 4014, 4015, 4016, 4017, 4018, 4019, 4020, 4021, 4022, 4023, 4024, 4025, 4026, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 4037, 4038, 4039, 4040, 4041, 4042, 4043, 4044, 4045, 4046, 4047, 4048, 4049, 4050, 4051, 4052, 4053, 4054, 4055, 4056, 4057, 4058, 4059, 4060, 4061, 4062, 4063, 4064, 4065, 4066, 4067, 4068, 4069, 4070, 4071, 4072, 4073, 4074, 4075, 4076, 4077, 4078, 4079, 4080, 4081, 4082, 4083, 4084, 4085, 4086, 4087, 4088, 4089, 4090, 4091, 4092, 4093, 4094, 4095, 4096, 4097, 4098, 4099, 4100, 4101, 4102, 4103, 4104, 4105, 4106, 4107, 4108, 4109, 4110, 4111, 4112, 4113, 4114, 4115, 4116, 4117, 4118, 4119, 4120, 4121, 4122, 4123, 4124, 4125, 4126, 4127, 4128, 4129, 4130, 4131, 4132, 4133, 4134, 4135, 4136, 4137, 4138, 4139, 4140, 4141, 4142, 4143, 4144, 4145, 4146, 4147, 4148, 4149, 4150, 4151, 4152, 4153, 4154, 4155, 4156, 4157, 4158, 4159, 4160, 4161, 4162, 4163, 4164, 4165, 4166, 4167, 4168, 4169, 4170, 4171, 4172, 4173, 4174, 4175, 4176, 4177, 4178, 4179, 4180, 4181, 4182, 4183, 4184, 4185, 4186, 4187, 4188, 4189, 4190, 4191, 4192, 4193, 4194, 4195, 4196, 4197, 4198, 4199, 4200, 4201, 4202, 4203, 4204, 4205, 4206, 4207, 4208, 4209, 4210, 4211, 4212, 4213, 4214, 4215, 4216, 4217, 4218, 4219, 4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 4228, 4229, 4230, 4231, 4232, 4233, 4234, 4235, 4236, 4237, 4238, 4239, 4240, 4241, 4242, 4243, 4244, 4245, 4246, 4247, 4248, 4249, 4250, 4251, 4900, 4400, 4402, 4403, 4404, 4405, 4406, 4407, 4408, 4409, 4410, 4411, 4412, 4413, 4414, 4415, 4416, 4417, 4418, 4419, 4420, 4421, 4422, 4423, 4424, 4425, 4426, 4427, 4428, 4429, 4430, 4431, 4432, 4433, 4434, 4435, 4436, 4437, 4438, 4439, 4440, 4441, 4442, 4443, 4444, 4445, 4446, 4447, 4448, 4449, 4450, 4451, 4452, 4453, 4454, 4455, 4456, 4457, 4458, 4459, 4460, 4461, 4462, 4463, 4464, 4465, 4466, 4467, 4468, 4469, 4470, 4471, 4472, 4473, 4474, 4475, 4476, 4477, 4478, 4479, 4480, 4481, 4482, 4483, 4484, 4485, 4486, 4487, 4488, 4489, 4490, 4491, 4492, 4493, 4494, 4495, 4496, 4497, 4498, 4499, 4500, 4501, 4502, 4503, 4504, 4505, 4506, 4507, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037, 2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048, 2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059, 2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092, 2093, 2094, 2095, 2096, 2097, 2098, 2099, 2100, 2101, 2102, 2103, 2104, 2105, 2106, 2107, 2108, 2109, 2110, 2150, 2151, 2152, 2153, 2154, 2155, 2240, 2241, 2242, 2243, 2244, 2245, 2246, 2247, 2248, 2249, 2250, 2251, 2252, 2253, 2254, 2255, 2256, 2257, 2258, 2259, 2260, 2261, 2262, 2263, 2264, 2265, 2266, 2267, 2268, 2269, 2270, 2271, 2272, 2273, 2274, 2275, 2276, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288, 2289, 2290, 2291, 2292, 2293, 2294, 2295, 2296, 2297, 2298, 2299, 2300, 2301, 2302, 2303, 2304, 2305, 2306, 2307, 2308, 2309, 2310, 2311, 2312, 2313, 2314, 2315, 2316, 2317, 2318, 2319, 2320, 2321, 2322, 2323, 2324, 2325, 2326, 2327, 2328, 2329, 2330, 2331, 2332, 2333, 2334, 2335, 2336, 2337, 2338, 2339, 2340, 2341, 2342, 2343, 2344, 2345, 2346, 2347, 2348, 2349, 2350, 2351, 2352, 2353, 2354, 2355, 2356, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, 2373, 2374, 2375, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421, 2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432, 2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443, 2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454, 2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465, 2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476, 2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2600, 2601, 2602, 2603, 2604, 2605, 2606, 2607, 2608, 2609, 2610, 2611, 2612, 2613, 2614, 2615, 2616, 2617, 2618, 2619, 2700, 2701, 2702, 2703, 2704, 2705, 2706, 2707, 2708, 2709, 2710, 2711, 2712, 2713, 2714, 2715, 2716, 2717, 2718, 2719, 2720, 2721, 2722, 2723, 2724, 2725, 2726, 2727, 2728, 2729, 2730, 2731, 2732, 2733, 2734, 2735, 2736, 2737, 2738, 2739, 2740, 2741, 2742, 2743, 2744, 2745, 2746, 2747, 2748, 2749, 2750, 2751, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2763, 2764, 2765, 2767, 2768, 2769, 2770, 2771, 2772, 2773, 2774, 2775, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2799, 2800, 2801, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, 2811, 2812, 2813, 2814, 2815, 2816, 2817, 2818, 2819, 2820, 2821, 2822, 2823, 2824, 2825, 2826, 2827, 2828, 2829, 2830, 2831, 2832, 2833, 2834, 2835, 2836, 2837, 2838, 2839, 2840, 2841, 2842, 2843, 2844, 2845, 2846, 2847, 2848, 2849, 2850, 2851, 2852, 2853, 2854, 2855, 2856, 2857, 2858, 7000, 7001, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7009, 7010, 7011, 7012, 7013, 7014, 7015, 7016, 7017, 7018, 7019, 7020, 7021, 7022, 7023, 7024, 7025, 7026, 7027, 7028, 7029, 7030, 7031, 7032, 7033, 7034, 7035, 7036, 7037, 7038, 7039, 7040, 7041, 7042, 7043, 7044, 7045, 7046, 7047, 7048, 7049, 7050, 7051, 7052, 7053, 7054, 7055, 7056, 7057, 7058, 7059, 7060, 7061, 7062, 7063, 7064, 7065, 7066, 7067, 7068, 7069, 7070, 7071, 7072, 7073, 7074, 7075, 7076, 7077, 7078, 7079, 7080, 7081, 7082, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7091, 7092, 7093, 7094, 7095, 7096, 7097, 7098, 7099, 7100, 7101, 7102, 7103, 7104, 7105, 7106, 7107, 7108, 7109, 7110, 7111, 7112, 7113, 7114, 7115, 7116, 7117, 7118, 7119, 7120, 7121, 7122, 7123, 7124, 7125, 7126, 7127, 7128, 7129, 7130, 7131, 7132, 7133, 7134, 7400, 7401, 7402, 7403, 7404, 7405, 7406, 7407, 7408, 7409, 7410, 7411, 7412, 7413, 7414, 7415, 7416, 7417, 7418, 7419, 7420, 7421, 7422, 7423, 7424, 7425, 7426, 7427, 7428, 7429, 7430, 7431, 7432, 7433, 7434, 7435, 7436, 7437, 7438, 7439, 7440, 7441, 7442, 7443, 7444, 7445, 7446, 7447, 7448, 7449, 7500, 7501, 7502, 7503, 7504, 7505, 7506, 7507, 7508, 7509, 7510, 7511, 7512, 7513, 7514, 7515, 7516, 7517, 7518, 7519, 7520, 7521, 7522, 7523, 7524, 7525, 7526, 7527, 7528, 7529, 7530, 7531, 7532, 7533, 7534, 7535, 7536, 7537, 7538, 7539, 7540, 7541, 7542, 7543, 7544, 7545, 7546, 7547, 7548, 7549, 7550, 7551, 7552, 7553, 7554, 7555, 7556, 7557, 7558, 7559, 7560, 7561, 7562, 7563, 7564, 7565, 7566, 7567, 7568, 7569, 7570, 7571, 7572, 7573, 7574, 7575, 7576, 7577, 7578, 7579, 7580, 7581, 7582, 7583, 7584, 7585, 7586, 7587, 7588, 7589, 7590, 7591, 7592, 7593, 7594, 7595, 7596, 7597, 7598, 7599, 7600, 7601, 7602, 7603, 7604, 7605, 7606, 7607, 7608, 7609, 7610, 7611, 7612, 7613, 7614, 7615, 7616, 7617, 7618, 7619, 7620, 7621, 7622, 7623, 7624, 7625, 7626, 7627, 7628, 7629, 7630, 7631, 7632, 7633, 7634, 7635, 7636, 7637, 7638, 7639, 7640, 7641, 7642, 7643, 7644, 7645, 7646, 7647, 7648, 7649, 7650, 7651, 7652, 7653, 7654, 7655, 7656, 7657, 7658, 7659, 7660, 7661, 7662, 7663, 7664, 7665, 7666, 7667, 7668, 7669, 7670, 7671, 7672, 7673, 7674, 7675, 7676, 7677, 7678, 7679, 7680, 7681, 7682, 7683, 7684, 7685, 7686, 7687, 7688, 7689, 7690, 7691, 7692, 7693, 7694, 7695, 7696, 7697, 7698, 7699, 7700, 7701, 7702, 7703, 7704, 7705, 7706, 7707, 7708, 7709, 7710, 7711, 7712, 7713, 7714, 7715, 7716, 7717, 7718, 7719, 7720, 7721, 7722, 7723, 7724, 7725, 7726, 7727, 7728, 7729, 7730, 7731, 7732, 7733, 7734, 7735, 7736, 7737, 7738, 7739, 7740, 7741, 7742, 7743, 7744, 7745, 7746, 7747, 7748, 7749, 7750, 7751, 7752, 7753, 7754, 7755, 7756, 7757, 7758, 7759, 7760, 7761, 7762, 7763, 7764, 7765, 7766, 7767, 7768, 7769, 7770, 7771, 7772, 7773, 7774, 7775, 7776, 7777, 7778, 7779, 7780, 7781, 7782, 7783, 7784, 7785, 7786, 7787, 7788, 7789, 7790, 7791, 7792, 7793, 7794, 7795, 7796, 7797, 7798, 7799, 7800, 7801, 7802, 7803, 7804, 7805, 7806, 7807, 7808, 7809, 7810, 7811, 7812, 7813, 7814, 7815, 7816, 7817, 7818, 7819, 7820, 7821, 7822, 7823, 7824, 7825, 7826, 7827, 7828, 7829, 7830, 7831, 7832, 7833, 7834, 7835, 7836, 7837, 7838, 7839, 7840, 7841, 7842, 7843, 7844, 7845, 7846, 7847, 7848, 7849, 7850, 7851, 7852, 7853, 7854, 7855, 7856, 7857, 7858, 7859, 7860, 7861, 7862, 7863, 7864, 7865, 7866, 7867, 7868, 7869, 7870, 7871, 7872, 7873, 7874, 7875, 7876, 7877, 7878, 7879, 7880, 7881, 8000, 8001, 8002, 8003, 8004, 8005, 8006, 8007, 8008, 8009, 8010, 8011, 8012, 8013, 8014, 8015, 8016, 8017, 8018, 8019, 8020, 8021, 8022, 8023, 8024, 8025, 8026, 8027, 8028, 8029, 8030, 8031, 8032, 8033, 8034, 8035, 8036, 8037, 8038, 8039, 8040, 8041, 8042, 8043, 8044, 8045, 8046, 8047, 8048, 8049, 8050, 8051, 8052, 8053, 8054, 8055, 8056, 8057, 8058, 8059, 8060, 8061, 8062, 8063, 8064, 8065, 8066, 8067, 8068, 8069, 8070, 8071, 8072, 8073, 8074, 8075, 8076, 8077, 8078, 8079, 8080, 8081, 8082, 8083, 8084, 8085, 8086, 8087, 8088, 8089, 8090, 8091, 8092, 8093, 8094, 8095, 8096, 8097, 8098, 8099, 9000, 9001, 9002, 9003, 9004, 9005, 9006, 9007, 9008, 9009, 9010, 9011, 9012, 9013, 9014, 9015, 9016, 9017, 9018, 9019, 9020, 9021, 9022, 9023, 9024, 9025, 9026]\n"
     ]
    }
   ],
   "source": [
    "bus_vehicles = list(range(1000,1150))+ list(range(2000,2111)) + list(range(2150,2156)) + list(range(2240,2486))\n",
    "bus_vehicles = bus_vehicles + list(range(2600,2620)) + list(range(2700,2766)) + list(range(2767,2859))\n",
    "bus_vehicles = bus_vehicles + list(range(7000,7135)) + list(range(7400,7450)) + list(range(7500,7620)) + list(range(7620,7882))\n",
    "bus_vehicles = bus_vehicles + list(range(8000,8100)) + list(range(9000,9027))\n",
    "valid_vehicles = streetcar_vehicles + bus_vehicles\n",
    "print(\"valid vehicles\",valid_vehicles)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vehicles that are not legitimate subjects of streetcar incidents\n",
    "The following vehicles are not legitimate subjects of streetcar incidents because they run on completely separate tracks (RT and subway) or they have been retired (6xxx buses).\n",
    "\n",
    "- RT cars 3xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_3000-3027\n",
    "- Subway cars 5xxx https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_5000-5371\n",
    "- Retired Buses 6xxx: https://cptdb.ca/wiki/index.php/Toronto_Transit_Commission_6000-6122\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the valid list of TTC Streetcar routes\n",
    "valid_routes = ['501','502','503','504','505','506','509','510','511','512','301','304','306','310']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['501',\n",
       " '502',\n",
       " '503',\n",
       " '504',\n",
       " '505',\n",
       " '506',\n",
       " '509',\n",
       " '510',\n",
       " '511',\n",
       " '512',\n",
       " '301',\n",
       " '304',\n",
       " '306',\n",
       " '310']"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_routes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# original valid directions\n",
    "# valid_directions = ['E/B','W/B','N/B','S/B','B/W']\n",
    "# revised valid directions to include lowercasing and removal of '/' and simplify to single letter\n",
    "valid_directions = ['e','w','n','s','b']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid_days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and Save Data\n",
    "- parse list of XLS files \n",
    "- load XLS files, tab by tab, into dataframe\n",
    "- pickle dataframe for future runs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get the directory for that this notebook is in and return the directory containing data files\n",
    "\n",
    "def get_path():\n",
    "    rawpath = os.getcwd()\n",
    "    # data is in a directory called \"data\" that is a sibling to the directory containing the notebook\n",
    "    path = os.path.abspath(os.path.join(rawpath, '..', 'data'))\n",
    "    return(path)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# given a path return the list of xls files in the directory\n",
    "def get_xls_list(path):\n",
    "    files = os.listdir(path)\n",
    "    files_xls = [f for f in files if f[-4:] == 'xlsx']\n",
    "    print(files)\n",
    "    print(files_xls)\n",
    "    return(files_xls)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_xls(path, files_xls, firstfile, firstsheet, df):\n",
    "    '''\n",
    "    load all the tabs of all the XLS files in a list of XLS files, minus tab that has seeded dataframe\n",
    "    \n",
    "    Parameters:\n",
    "    path: directory containing the XLS files\n",
    "    files_xls: list of XLS files\n",
    "    firstfile: file whose first tab has been preloaded\n",
    "    firstsheet: first tab of the file that has been preloaded\n",
    "    df: Pandas dataframe that has been preloaded with the first tab of the first XLS file and is loaded with all the data when the function returns\n",
    "    \n",
    "    Returns:\n",
    "    df: updated dataframe\n",
    "    \n",
    "    '''\n",
    "    for f in files_xls:\n",
    "        print(\"file name\",f)\n",
    "        xlsf = pd.ExcelFile(os.path.join(path,f))\n",
    "        # iterate through sheets\n",
    "        for sheet_name in xlsf.sheet_names:\n",
    "            print(\"sheet_name\",sheet_name)\n",
    "            if (f != firstfile) or (sheet_name != firstsheet):\n",
    "                print(\"sheet_name in loop\",sheet_name)\n",
    "                data = pd.read_excel(os.path.join(path,f),sheet_name=sheet_name)    \n",
    "                df = df.append(data)\n",
    "    return (df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# given a path and a filename, load all the XLS files in the path into a dataframe and save\n",
    "# to the dataframe to the filename\n",
    "def reloader(path,picklename):\n",
    "    # get list of all xls files in the path\n",
    "    files_xls = get_xls_list(path)\n",
    "    print(\"list of xls\",files_xls)\n",
    "    # seed initial tab on initial xls file\n",
    "    dfnew = pd.read_excel(os.path.join(path,files_xls[0]))\n",
    "    # get the list of sheets in the first file\n",
    "    xlsf = pd.ExcelFile(os.path.join(path,files_xls[0]))\n",
    "    # load the remaining tabs from all the other xls\n",
    "    # pass the first file (files_xls[0]) and the first tab in that file (xlsf[0]) explicitly\n",
    "    dflatest = load_xls(path,files_xls,files_xls[0],xlsf.sheet_names[0], dfnew)\n",
    "    # save dataframe to pickle\n",
    "    dflatest.to_pickle(os.path.join(path,picklename))\n",
    "    # return dataframe loaded with all tabs of all xls files\n",
    "    return(dflatest)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define categories for input columns\n",
    "def define_feature_categories(df):\n",
    "    allcols = list(df)\n",
    "    print(\"all cols\",allcols)\n",
    "    textcols = ['Incident','Location'] # \n",
    "    continuouscols = ['Min Delay','Min Gap'] \n",
    "                      # columns to deal with as continuous values - no embeddings\n",
    "    timecols = ['Report Date','Time']\n",
    "    collist = ['Day','Vehicle','Route','Direction']\n",
    "    for col in continuouscols:\n",
    "        df[col] = df[col].astype(float)\n",
    "    print('texcols: ',textcols)\n",
    "    print('continuouscols: ',continuouscols)\n",
    "    print('timecols: ',timecols)\n",
    "    print('collist: ',collist)\n",
    "    return(allcols,textcols,continuouscols,timecols,collist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill missing values according to the column category\n",
    "def fill_missing(dataset,allcols,textcols,continuouscols,timecols,collist):\n",
    "    logging.debug(\"before mv\")\n",
    "    for col in collist:\n",
    "        dataset[col].fillna(value=\"missing\", inplace=True)\n",
    "    for col in continuouscols:\n",
    "        dataset[col].fillna(value=0.0,inplace=True)\n",
    "    for col in textcols:\n",
    "        dataset[col].fillna(value=\"missing\", inplace=True)\n",
    "    return (dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load dataframe\n",
    "- load pickled dataframe\n",
    "- show info about the dataset\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read in data, either from original XLS files in data directory or from pickled dataframe containing\n",
    "def ingest_data(path):\n",
    "    if load_from_scratch:\n",
    "        unpickled_df = reloader(path,pickled_input_dataframe)\n",
    "        logging.debug(\"reloader done\")\n",
    "    else:\n",
    "        unpickled_df = pd.read_pickle(os.path.join(path,pickled_input_dataframe))\n",
    "    return(unpickled_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# General cleanup\n",
    "- correct types for Route and Vehicle\n",
    "- fill missing values\n",
    "- create report-date-time index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the dataset incorporated some anomalies in the 2019 data, including:\n",
    "# extraneous Incident ID in April 2019 tab\n",
    "# Gap and Delay columns in April and June 2019 tabs for what had otherwise been called Min Gap and Min Delay\n",
    "# this function cleans up these anomalies\n",
    "def fix_anomalous_columns(df):\n",
    "    # for rows where there is NaN in the Min Delay or Min Gap columns, copy over value from Delay or Gap\n",
    "    # df.Temp_Rating.fillna(df.Farheit, inplace=True)\n",
    "    df['Min Delay'].fillna(df['Delay'], inplace=True)\n",
    "    df['Min Gap'].fillna(df['Gap'], inplace=True)\n",
    "    # now that the useful values have been copied from Delay and Gap, remove them\n",
    "    del df['Delay']\n",
    "    del df['Gap']\n",
    "    # remove Incident ID column - it's extraneous\n",
    "    del df['Incident ID']\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_time(date_time_value,time_value):\n",
    "    ''' given a datetime replace the time portion '''\n",
    "     \n",
    "    date_time_value = date_time_value.replace(hour=time_value.hour,minute=time_value.minute,second=time_value.minute)\n",
    "    return(date_time_value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def general_cleanup(df):\n",
    "    # ensure Route and Vehicle are strings, not numeric\n",
    "    df['Route'] = df['Route'].astype(str)\n",
    "    df['Vehicle'] = df['Vehicle'].astype(str)\n",
    "    # remove extraneous characters left from Vehicle values being floats\n",
    "    df['Vehicle'] = df['Vehicle'].str[:-2]\n",
    "    # tactical definition of categories\n",
    "    allcols,textcols,continuouscols,timecols,collist = define_feature_categories(df)\n",
    "    # fill in missing values\n",
    "    df.isnull().sum(axis = 0)\n",
    "    df = fix_anomalous_columns(df)\n",
    "    df = fill_missing(df,allcols,textcols,continuouscols,timecols,collist)\n",
    "    # create new column combining date + time (needed for resampling) and make it the index\n",
    "    df['Report Date Time'] = df.apply(lambda x: replace_time(x['Report Date'], x['Time']), axis=1)\n",
    "    df.index = df['Report Date Time']\n",
    "    # return the updated dataframe along with the column category lists\n",
    "    return(df,allcols,textcols,continuouscols,timecols,collist)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up selected columns\n",
    "Some values in the input dataset were entered \"free form\" when they should have been constricted to a pick list. Columns with this problem include:\n",
    "\n",
    "- Route\n",
    "- Vehicle\n",
    "- Direction\n",
    "- Location\n",
    "\n",
    "\n",
    "Each of these have a finite set of valid values. We have to fix the data in these columns where multiple tokens have been used to signify the same real-world entity (e.g. \"roncesvalles yard.\" and \"roncesvalles carhouse\", or where incorrect values have been entered (e.g. Direction that does not correspond with a compass point)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Route"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_route (x):\n",
    "    if x in valid_routes:\n",
    "        return(x)\n",
    "    else:\n",
    "        return(\"bad route\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "def route_cleanup(df):\n",
    "    print(\"Route count pre cleanup\",df['Route'].nunique())\n",
    "    # df['Route'].value_counts()\n",
    "    # replace bad route with common token\n",
    "    df['Route'] = df['Route'].apply(lambda x:check_route(x))\n",
    "    print(\"route count post cleanup\",df['Route'].nunique())\n",
    "    return(df)    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Vehicle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_vehicle (x):\n",
    "    if str.isdigit(x):\n",
    "        if int(x) in valid_vehicles:\n",
    "            return x\n",
    "        else:\n",
    "            return(\"bad vehicle\")\n",
    "    else:\n",
    "        return(\"bad vehicle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "def vehicle_cleanup(df):\n",
    "    print(\"Vehicle count pre cleanup\",df['Vehicle'].nunique())\n",
    "    df['Vehicle'] = df['Vehicle'].apply(lambda x:check_vehicle(x))\n",
    "    print(\"Vehicle count post cleanup\",df['Vehicle'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Direction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_direction (x):\n",
    "    if x in valid_directions:\n",
    "        return(x)\n",
    "    else:\n",
    "        return(\"bad direction\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "def direction_cleanup(df):\n",
    "    print(\"Direction count pre cleanup\",df['Direction'].nunique())\n",
    "    df['Direction'] = df['Direction'].str.lower()\n",
    "    df['Direction'] = df['Direction'].str.replace('/','')\n",
    "    df['Direction'] = df['Direction'].replace({'eastbound':'e','westbound':'w','southbound':'s','northbound':'n'})\n",
    "    df['Direction'] = df['Direction'].replace('b','',regex=True)\n",
    "    df['Direction'] = df['Direction'].apply(lambda x:check_direction(x))\n",
    "    print(\"Direction count post cleanup\",df['Direction'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean up Location"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_conjunction(intersection):\n",
    "    intersection = re.sub(\" *& *\",\" and \",intersection)\n",
    "    intersection = re.sub(\" */ *\",\" and \",intersection)\n",
    "    return(intersection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "def order_location(intersection):\n",
    "    # for any string with the format \"* and *\" if the value before the and is alphabetically\n",
    "    # higher than the value after the and, swap the values\n",
    "    conj = \" and \"\n",
    "    alpha_ordered_intersection = intersection\n",
    "    if conj in intersection:\n",
    "        end_first_street = intersection.find(conj)\n",
    "        if (end_first_street > 0) and (len(intersection) > (end_first_street + len(conj))):\n",
    "            start_second_street = intersection.find(conj) + len(conj)\n",
    "            first_street = intersection[0:end_first_street]\n",
    "            second_street = intersection[start_second_street:]\n",
    "            alpha_ordered_intersection = min(first_street,second_street)+conj+max(first_street,second_street)\n",
    "    return(alpha_ordered_intersection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "def location_cleanup(df):\n",
    "    print(\"Location count pre cleanup\",df['Location'].nunique())\n",
    "    # make all location values lower case\n",
    "    df['Location'] = df['Location'].str.lower()\n",
    "    # make substitutions to eliminate obvious duplicate tokens\n",
    "    df['Location'] = df['Location'].replace({'broadviewstation':'broadview station',' at ':' and ',' stn':' station',' ave.':'','/':' and ','roncy':'roncesvalles','carhouse':'yard','yard.':'yard','st. clair':'st clair','ronc. ':'roncesvalles ','long branch':'longbranch','garage':'yard','barns':'yard',' & ':' and '}, regex=True)\n",
    "    # put intersection values into consistent order\n",
    "    df['Location'] = df['Location'].apply(lambda x:order_location(x))\n",
    "    print(\"Location count post cleanup\",df['Location'].nunique())\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Remove bad rows"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove rows with bad values\n",
    "def remove_bad(df):\n",
    "    df = df[df.Vehicle != 'bad vehicle']\n",
    "    df = df[df.Direction != 'bad direction']\n",
    "    df = df[df.Route != 'bad route']\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n# get the path for data files\\npath = get_path()\\npickled_input_dataframe = \\'2014_2019_upto_june_from_repo.pkl\\'\\nprint(\"path is \",path)\\n# load route direction and delay data datframes\\ndf = ingest_data(path)\\ndf.head()\\n'"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'''\n",
    "# get the path for data files\n",
    "path = get_path()\n",
    "pickled_input_dataframe = '2014_2019_upto_june_from_repo.pkl'\n",
    "print(\"path is \",path)\n",
    "# load route direction and delay data datframes\n",
    "df = ingest_data(path)\n",
    "df.head()\n",
    "'''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Master cell\n",
    "This cell contains calls to the other functions in this notebook to complete the data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "path is  C:\\personal\\manning\\deep_learning_for_structured_data\\data\n",
      "number of records:  78525\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 78525 entries, 0 to 814\n",
      "Data columns (total 13 columns):\n",
      "Day            78525 non-null object\n",
      "Delay          3444 non-null float64\n",
      "Direction      78217 non-null object\n",
      "Gap            3434 non-null float64\n",
      "Incident       78525 non-null object\n",
      "Incident ID    889 non-null float64\n",
      "Location       78276 non-null object\n",
      "Min Delay      75004 non-null float64\n",
      "Min Gap        74975 non-null float64\n",
      "Report Date    78525 non-null datetime64[ns]\n",
      "Route          78525 non-null int64\n",
      "Time           78525 non-null object\n",
      "Vehicle        73890 non-null float64\n",
      "dtypes: datetime64[ns](1), float64(6), int64(1), object(5)\n",
      "memory usage: 8.4+ MB\n",
      "df.info() output None\n",
      "df.shape output (78525, 13)\n",
      "df.describe() output              Delay          Gap  Incident ID     Min Delay       Min Gap  \\\n",
      "count  3444.000000  3434.000000   889.000000  75004.000000  74975.000000   \n",
      "mean     14.503194    20.133663     3.901012     12.817570     18.304915   \n",
      "std      38.477453    37.362669     2.861142     30.083595     33.678868   \n",
      "min       0.000000     0.000000     1.000000      0.000000      0.000000   \n",
      "25%       5.000000    10.000000     1.000000      5.000000      9.000000   \n",
      "50%       7.000000    14.000000     5.000000      6.000000     12.000000   \n",
      "75%      11.000000    20.000000     7.000000     12.000000     20.000000   \n",
      "max     996.000000   999.000000    10.000000   1400.000000   4216.000000   \n",
      "\n",
      "              Route        Vehicle  \n",
      "count  78525.000000   73890.000000  \n",
      "mean     500.840688    4405.327920  \n",
      "std       45.366162    1570.424282  \n",
      "min        1.000000       0.000000  \n",
      "25%      501.000000    4077.000000  \n",
      "50%      505.000000    4170.000000  \n",
      "75%      509.000000    4412.000000  \n",
      "max      999.000000  163242.000000  \n",
      "df.types output Day                    object\n",
      "Delay                 float64\n",
      "Direction              object\n",
      "Gap                   float64\n",
      "Incident               object\n",
      "Incident ID           float64\n",
      "Location               object\n",
      "Min Delay             float64\n",
      "Min Gap               float64\n",
      "Report Date    datetime64[ns]\n",
      "Route                   int64\n",
      "Time                   object\n",
      "Vehicle               float64\n",
      "dtype: object\n",
      "all cols ['Day', 'Delay', 'Direction', 'Gap', 'Incident', 'Incident ID', 'Location', 'Min Delay', 'Min Gap', 'Report Date', 'Route', 'Time', 'Vehicle']\n",
      "texcols:  ['Incident', 'Location']\n",
      "continuouscols:  ['Min Delay', 'Min Gap']\n",
      "timecols:  ['Report Date', 'Time']\n",
      "collist:  ['Day', 'Vehicle', 'Route', 'Direction']\n",
      "record count by year pre processing:  Counter({2018: 15612, 2016: 14021, 2017: 13762, 2015: 12221, 2019: 11882, 2014: 11027})\n",
      "Route count pre cleanup 117\n",
      "route count post cleanup 15\n",
      "Vehicle count pre cleanup 2663\n",
      "Vehicle count post cleanup 1019\n",
      "Direction count pre cleanup 101\n",
      "Direction count post cleanup 5\n",
      "Location count pre cleanup 17086\n",
      "Location count post cleanup 11000\n",
      "Bad route count pre: 2544\n",
      "Bad direction count pre: 407\n",
      "Bad vehicle count pre: 14656\n",
      "Bad route count: 0\n",
      "Bad direction count: 0\n",
      "Bad vehicle count: 0\n",
      "df.shape output post removal of bad records  (61553, 11)\n",
      "path is  C:\\personal\\manning\\deep_learning_for_structured_data\\data\n",
      "file_name is  C:\\personal\\manning\\deep_learning_for_structured_data\\data\\2014_2019_df_cleaned_remove_bad_values_may16_2020.pkl\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Day</th>\n",
       "      <th>Direction</th>\n",
       "      <th>Incident</th>\n",
       "      <th>Location</th>\n",
       "      <th>Min Delay</th>\n",
       "      <th>Min Gap</th>\n",
       "      <th>Report Date</th>\n",
       "      <th>Route</th>\n",
       "      <th>Time</th>\n",
       "      <th>Vehicle</th>\n",
       "      <th>Report Date Time</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Report Date Time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2014-01-02 06:31:31</td>\n",
       "      <td>Thursday</td>\n",
       "      <td>e</td>\n",
       "      <td>Late Leaving Garage</td>\n",
       "      <td>dundas and roncesvalles</td>\n",
       "      <td>4.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2014-01-02</td>\n",
       "      <td>505</td>\n",
       "      <td>06:31:00</td>\n",
       "      <td>4018</td>\n",
       "      <td>2014-01-02 06:31:31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2014-01-02 12:43:43</td>\n",
       "      <td>Thursday</td>\n",
       "      <td>e</td>\n",
       "      <td>Utilized Off Route</td>\n",
       "      <td>king and shaw</td>\n",
       "      <td>20.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>2014-01-02</td>\n",
       "      <td>504</td>\n",
       "      <td>12:43:00</td>\n",
       "      <td>4128</td>\n",
       "      <td>2014-01-02 12:43:43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2014-01-02 14:01:01</td>\n",
       "      <td>Thursday</td>\n",
       "      <td>w</td>\n",
       "      <td>Held By</td>\n",
       "      <td>bingham and kingston road</td>\n",
       "      <td>13.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>2014-01-02</td>\n",
       "      <td>501</td>\n",
       "      <td>14:01:00</td>\n",
       "      <td>4016</td>\n",
       "      <td>2014-01-02 14:01:01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2014-01-02 14:22:22</td>\n",
       "      <td>Thursday</td>\n",
       "      <td>w</td>\n",
       "      <td>Investigation</td>\n",
       "      <td>king st. and roncesvalles</td>\n",
       "      <td>7.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>2014-01-02</td>\n",
       "      <td>504</td>\n",
       "      <td>14:22:00</td>\n",
       "      <td>4175</td>\n",
       "      <td>2014-01-02 14:22:22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2014-01-02 16:42:42</td>\n",
       "      <td>Thursday</td>\n",
       "      <td>e</td>\n",
       "      <td>Utilized Off Route</td>\n",
       "      <td>bathurst and king</td>\n",
       "      <td>3.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2014-01-02</td>\n",
       "      <td>504</td>\n",
       "      <td>16:42:00</td>\n",
       "      <td>4080</td>\n",
       "      <td>2014-01-02 16:42:42</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          Day Direction             Incident  \\\n",
       "Report Date Time                                               \n",
       "2014-01-02 06:31:31  Thursday         e  Late Leaving Garage   \n",
       "2014-01-02 12:43:43  Thursday         e   Utilized Off Route   \n",
       "2014-01-02 14:01:01  Thursday         w              Held By   \n",
       "2014-01-02 14:22:22  Thursday         w        Investigation   \n",
       "2014-01-02 16:42:42  Thursday         e   Utilized Off Route   \n",
       "\n",
       "                                      Location  Min Delay  Min Gap  \\\n",
       "Report Date Time                                                     \n",
       "2014-01-02 06:31:31    dundas and roncesvalles        4.0      8.0   \n",
       "2014-01-02 12:43:43              king and shaw       20.0     22.0   \n",
       "2014-01-02 14:01:01  bingham and kingston road       13.0     19.0   \n",
       "2014-01-02 14:22:22  king st. and roncesvalles        7.0     11.0   \n",
       "2014-01-02 16:42:42          bathurst and king        3.0      6.0   \n",
       "\n",
       "                    Report Date Route      Time Vehicle    Report Date Time  \n",
       "Report Date Time                                                             \n",
       "2014-01-02 06:31:31  2014-01-02   505  06:31:00    4018 2014-01-02 06:31:31  \n",
       "2014-01-02 12:43:43  2014-01-02   504  12:43:00    4128 2014-01-02 12:43:43  \n",
       "2014-01-02 14:01:01  2014-01-02   501  14:01:00    4016 2014-01-02 14:01:01  \n",
       "2014-01-02 14:22:22  2014-01-02   504  14:22:00    4175 2014-01-02 14:22:22  \n",
       "2014-01-02 16:42:42  2014-01-02   504  16:42:00    4080 2014-01-02 16:42:42  "
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# master cell to call the other functions\n",
    "# get the path for data files\n",
    "path = get_path()\n",
    "print(\"path is \",path)\n",
    "# load route direction and delay data datframes\n",
    "df = ingest_data(path)\n",
    "print(\"number of records: \",len(df.index))\n",
    "print(\"df.info() output\",df.info())\n",
    "print(\"df.shape output\",df.shape)\n",
    "print(\"df.describe() output\",df.describe())\n",
    "print(\"df.types output\",df.dtypes)\n",
    "df,allcols,textcols,continuouscols,timecols,collist = general_cleanup(df)\n",
    "df.head()\n",
    "# get record count by year\n",
    "from collections import Counter\n",
    "df_year = pd.DatetimeIndex(df['Report Date Time']).year\n",
    "print(\"record count by year pre processing: \", str(Counter(df_year)))\n",
    "# check that the values for April 2019 are correct\n",
    "df[df['Report Date Time'].astype(str).str[:7]=='2019-04']\n",
    "# cleanup Route\n",
    "logging.debug(\"df.shape output pre route cleanup\",df.shape)\n",
    "df = route_cleanup(df) \n",
    "df = vehicle_cleanup(df)\n",
    "df = direction_cleanup(df)\n",
    "df = location_cleanup(df)\n",
    "logging.debug(\"df.shape output post location\",df.shape)\n",
    "print(\"Bad route count pre:\",df[df.Route == 'bad route'].shape[0])\n",
    "print(\"Bad direction count pre:\",df[df.Direction == 'bad direction'].shape[0])\n",
    "print(\"Bad vehicle count pre:\",df[df.Vehicle == 'bad vehicle'].shape[0])\n",
    "if remove_bad_values:\n",
    "    df = remove_bad(df)\n",
    "print(\"Bad route count:\",df[df.Route == 'bad route'].shape[0])\n",
    "print(\"Bad direction count:\",df[df.Direction == 'bad direction'].shape[0])\n",
    "print(\"Bad vehicle count:\",df[df.Vehicle == 'bad vehicle'].shape[0])\n",
    "# pickle the cleansed dataframe\n",
    "print(\"df.shape output post removal of bad records \",df.shape)\n",
    "if save_transformed_dataframe:\n",
    "    print(\"path is \",path)\n",
    "    file_name = os.path.join(path,pickled_output_dataframe)\n",
    "    print(\"file_name is \",file_name)\n",
    "    df.to_pickle(file_name)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({2014: 9371,\n",
       "         2015: 10898,\n",
       "         2016: 11908,\n",
       "         2017: 9891,\n",
       "         2018: 12011,\n",
       "         2019: 7474})"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get record count by year\n",
    "from collections import Counter\n",
    "df_year = pd.DatetimeIndex(df['Report Date Time']).year\n",
    "Counter(df_year)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
